In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scipy as sp
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
In [7]:
df = pd.DataFrame(pd.read_csv("downloads/diabetes.csv"))
In [11]:
#displaying the head of dataset
df.head(6)
Out[11]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
| 5 | 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | 0 |
In [13]:
#description of dataset
df.describe()
Out[13]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
In [15]:
df.columns
Out[15]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
dtype='object')
In [17]:
df.shape
Out[17]:
(768, 9)
In [21]:
#checking for null values
df.isnull().sum()
Out[21]:
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
In [23]:
df.isnull().any()
Out[23]:
Pregnancies False Glucose False BloodPressure False SkinThickness False Insulin False BMI False DiabetesPedigreeFunction False Age False Outcome False dtype: bool
In [25]:
df.isnull().all()
Out[25]:
Pregnancies False Glucose False BloodPressure False SkinThickness False Insulin False BMI False DiabetesPedigreeFunction False Age False Outcome False dtype: bool
In [49]:
#It is better to replace zeros with nan since after that counting them would be easier and zeros need to be replaced with suitable values
df_new = df.copy(deep = True)
df_new[['Glucose','BloodPressure','SkinThickness','Insulin','BMI', 'DiabetesPedigreeFunction','Age']] = df_new[['Glucose','BloodPressure','SkinThickness','Insulin','BMI', 'DiabetesPedigreeFunction','Age']].replace(0,np.NaN)
## showing the count of Nans
print(df_new.isnull().sum())
Pregnancies 0 Glucose 5 BloodPressure 35 SkinThickness 227 Insulin 374 BMI 11 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
In [35]:
df_new = df.std()
In [37]:
df_new
Out[37]:
Pregnancies 3.369578 Glucose 31.972618 BloodPressure 19.355807 SkinThickness 15.952218 Insulin 115.244002 BMI 7.884160 DiabetesPedigreeFunction 0.331329 Age 11.760232 Outcome 0.476951 dtype: float64
In [39]:
df_new.mean()
Out[39]:
22.927432797872843
In [41]:
df_new.isnull().any()
Out[41]:
False
In [66]:
# EDA
In [68]:
df.corr()
Out[68]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| Pregnancies | 1.000000 | 0.129459 | 0.141282 | -0.081672 | -0.073535 | 0.017683 | -0.033523 | 0.544341 | 0.221898 |
| Glucose | 0.129459 | 1.000000 | 0.152590 | 0.057328 | 0.331357 | 0.221071 | 0.137337 | 0.263514 | 0.466581 |
| BloodPressure | 0.141282 | 0.152590 | 1.000000 | 0.207371 | 0.088933 | 0.281805 | 0.041265 | 0.239528 | 0.065068 |
| SkinThickness | -0.081672 | 0.057328 | 0.207371 | 1.000000 | 0.436783 | 0.392573 | 0.183928 | -0.113970 | 0.074752 |
| Insulin | -0.073535 | 0.331357 | 0.088933 | 0.436783 | 1.000000 | 0.197859 | 0.185071 | -0.042163 | 0.130548 |
| BMI | 0.017683 | 0.221071 | 0.281805 | 0.392573 | 0.197859 | 1.000000 | 0.140647 | 0.036242 | 0.292695 |
| DiabetesPedigreeFunction | -0.033523 | 0.137337 | 0.041265 | 0.183928 | 0.185071 | 0.140647 | 1.000000 | 0.033561 | 0.173844 |
| Age | 0.544341 | 0.263514 | 0.239528 | -0.113970 | -0.042163 | 0.036242 | 0.033561 | 1.000000 | 0.238356 |
| Outcome | 0.221898 | 0.466581 | 0.065068 | 0.074752 | 0.130548 | 0.292695 | 0.173844 | 0.238356 | 1.000000 |
In [72]:
#A **heat map** is a two-dimensional representation of information with the help of colors. Heat maps can help the user visualize simple or complex information.
#correlation
# we can see skin thickness,insulin,pregnencies and age are full independent to each other
#age and pregencies has negative correlation
sns.heatmap(df.corr(),annot = True)
Out[72]:
<Axes: >
In [74]:
#Histogram
df.hist(figsize = (10,10))
plt.show()
In [76]:
sns.set(style="ticks")
sns.pairplot(df, hue="Outcome")
Out[76]:
<seaborn.axisgrid.PairGrid at 0x175a75460>
In [78]:
#box plot for outlier visualization
sns.set(style="whitegrid")
df.boxplot(figsize=(15,6))
Out[78]:
<Axes: >
In [82]:
#box plot
sns.set(style="whitegrid")
sns.boxplot(x=df['Insulin'])
plt.show()
sns.boxplot(x=df['BloodPressure'])
plt.show()
sns.boxplot(x=df['DiabetesPedigreeFunction'])
plt.show()
In [84]:
#outlier remove
Q1=df.quantile(0.25)
Q3=df.quantile(0.75)
IQR=Q3-Q1
print("---Q1--- \n",Q1)
print("\n---Q3--- \n",Q3)
print("\n---IQR---\n",IQR)
---Q1--- Pregnancies 1.00000 Glucose 99.00000 BloodPressure 62.00000 SkinThickness 0.00000 Insulin 0.00000 BMI 27.30000 DiabetesPedigreeFunction 0.24375 Age 24.00000 Outcome 0.00000 Name: 0.25, dtype: float64 ---Q3--- Pregnancies 6.00000 Glucose 140.25000 BloodPressure 80.00000 SkinThickness 32.00000 Insulin 127.25000 BMI 36.60000 DiabetesPedigreeFunction 0.62625 Age 41.00000 Outcome 1.00000 Name: 0.75, dtype: float64 ---IQR--- Pregnancies 5.0000 Glucose 41.2500 BloodPressure 18.0000 SkinThickness 32.0000 Insulin 127.2500 BMI 9.3000 DiabetesPedigreeFunction 0.3825 Age 17.0000 Outcome 1.0000 dtype: float64
In [86]:
#outlier remove
df_out = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.shape,df_out.shape
#more than 80 records deleted
Out[86]:
((768, 9), (639, 9))
In [88]:
#Scatter matrix after removing outlier
sns.set(style="ticks")
sns.pairplot(df_out, hue="Outcome")
plt.show()
In [90]:
# visualizing distribution of Y feature (predictive variable)
plt.figure(figsize=(10,7))
df_out.Outcome.value_counts().sort_index().plot.bar()
diabetic_rate = df_out.Outcome.mean()
plt.title(f"Overall diabetes diagnosis rate: {diabetic_rate:.2%}", size=17)
plt.xlabel('Is diabetic?', size=17)
plt.ylabel('Count of Patients', size=17)
Out[90]:
Text(0, 0.5, 'Count of Patients')
In [92]:
df.shape
Out[92]:
(768, 9)
In [94]:
df.head()
Out[94]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
In [96]:
#lets extract features and targets
X=df_out.drop(columns=['Outcome'])
y=df_out['Outcome']
In [98]:
#Splitting train test data 80 20 ratio
from sklearn.model_selection import train_test_split
In [100]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)
In [102]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape
Out[102]:
((511, 8), (128, 8), (511,), (128,))
In [104]:
from sklearn.metrics import confusion_matrix,accuracy_score,make_scorer
from sklearn.model_selection import cross_validate
def tn(y_test, y_pred): return confusion_matrix(y_test, y_train)[0, 0]
def fp(y_test, y_pred): return confusion_matrix(y_test, y_pred)[0, 1]
def fn(y_test, y_pred): return confusion_matrix(y_test, y_pred)[1, 0]
def tp(y_test, y_pred): return confusion_matrix(y_test, y_pred)[1, 1]
In [106]:
#cross validation purpose
scoring = {'accuracy': make_scorer(accuracy_score),'prec': 'precision'}
scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn),
'fp': make_scorer(fp), 'fn': make_scorer(fn)}
def display_result(result):
print("TP: ",result['test_tp'])
print("TN: ",result['test_tn'])
print("FN: ",result['test_fn'])
print("FP: ",result['test_fp'])
In [108]:
#Perform 2 sample z-test
from statsmodels.stats.weightstats import ztest as ztest
ztest(X_train['BMI'],X_train['Age'],value=0)
Out[108]:
(-1.0803107438930162, 0.2800038269669761)
In [112]:
#Lets build the model
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Fit the model
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
# Predict
y_predict = model.predict(X_test)
# Compute confusion matrix
cm = confusion_matrix(y_test, y_predict)
# Plot confusion matrix using ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
# Show the score
model_score = model.score(X_test, y_test)
print(f"Model Score: {model_score}")
Model Score: 0.765625
In [128]:
accuracy = metrics.accuracy_score(y_test, y_predict)
print("Accuracy: %.3f" % accuracy)
precision = metrics.precision_score(y_test, y_predict)
print("Precision: %.3f" % precision)
recall = metrics.recall_score(y_test, y_predict)
print("Recall: %.3f" % recall)
f1 = metrics.f1_score(y_test, y_predict)
print("F1 Score: %.3f" % f1)
Accuracy: 0.766 Precision: 0.643 Recall: 0.474 F1 Score: 0.545
In [154]:
pip install xgboost
Collecting xgboost Downloading xgboost-2.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB) Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.12/site-packages (from xgboost) (1.26.4) Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.12/site-packages (from xgboost) (1.13.1) Downloading xgboost-2.1.1-py3-none-macosx_12_0_arm64.whl (1.9 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 595.6 kB/s eta 0:00:0000:0100:01 Installing collected packages: xgboost Successfully installed xgboost-2.1.1 Note: you may need to restart the kernel to use updated packages.
In [156]:
conda install -c conda-forge xgboost
Channels:
- conda-forge
- defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done
## Package Plan ##
environment location: /opt/anaconda3
added / updated specs:
- xgboost
The following packages will be downloaded:
package | build
---------------------------|-----------------
_py-xgboost-mutex-2.0 | cpu_0 12 KB conda-forge
ca-certificates-2024.8.30 | hf0a4a13_0 155 KB conda-forge
certifi-2024.8.30 | pyhd8ed1ab_0 160 KB conda-forge
conda-24.7.1 | py312h81bd7bf_0 1.2 MB conda-forge
libcxx-18.1.8 | h3ed4263_7 427 KB conda-forge
libexpat-2.6.2 | hebf3989_0 62 KB conda-forge
libsqlite-3.46.0 | hfb93653_0 811 KB conda-forge
libxgboost-2.1.1 | cpu_h27903ac_2 1.3 MB conda-forge
libzlib-1.2.13 | hfb2fe0b_6 46 KB conda-forge
llvm-openmp-18.1.8 | hde57baf_1 270 KB conda-forge
openssl-3.3.2 | h8359307_0 2.7 MB conda-forge
py-xgboost-2.1.1 | cpu_pyhb442362_2 132 KB conda-forge
python-3.12.2 |hdf0ec26_0_cpython 12.5 MB conda-forge
python_abi-3.12 | 5_cp312 6 KB conda-forge
xgboost-2.1.1 | cpu_pyhb8f9a19_2 15 KB conda-forge
zlib-1.2.13 | hfb2fe0b_6 76 KB conda-forge
------------------------------------------------------------
Total: 19.8 MB
The following NEW packages will be INSTALLED:
_py-xgboost-mutex conda-forge/osx-arm64::_py-xgboost-mutex-2.0-cpu_0
libexpat conda-forge/osx-arm64::libexpat-2.6.2-hebf3989_0
libsqlite conda-forge/osx-arm64::libsqlite-3.46.0-hfb93653_0
libxgboost conda-forge/osx-arm64::libxgboost-2.1.1-cpu_h27903ac_2
libzlib conda-forge/osx-arm64::libzlib-1.2.13-hfb2fe0b_6
py-xgboost conda-forge/noarch::py-xgboost-2.1.1-cpu_pyhb442362_2
python_abi conda-forge/osx-arm64::python_abi-3.12-5_cp312
xgboost conda-forge/noarch::xgboost-2.1.1-cpu_pyhb8f9a19_2
The following packages will be UPDATED:
ca-certificates pkgs/main::ca-certificates-2024.7.2-h~ --> conda-forge::ca-certificates-2024.8.30-hf0a4a13_0
libcxx pkgs/main::libcxx-14.0.6-h848a8c0_0 --> conda-forge::libcxx-18.1.8-h3ed4263_7
llvm-openmp pkgs/main::llvm-openmp-14.0.6-hc6e570~ --> conda-forge::llvm-openmp-18.1.8-hde57baf_1
openssl pkgs/main::openssl-3.0.15-h80987f9_0 --> conda-forge::openssl-3.3.2-h8359307_0
zlib pkgs/main::zlib-1.2.13-h18a0788_1 --> conda-forge::zlib-1.2.13-hfb2fe0b_6
The following packages will be SUPERSEDED by a higher-priority channel:
certifi pkgs/main/osx-arm64::certifi-2024.8.3~ --> conda-forge/noarch::certifi-2024.8.30-pyhd8ed1ab_0
conda pkgs/main::conda-24.7.1-py312hca03da5~ --> conda-forge::conda-24.7.1-py312h81bd7bf_0
python pkgs/main::python-3.12.4-h99e199e_1 --> conda-forge::python-3.12.2-hdf0ec26_0_cpython
Downloading and Extracting Packages:
python-3.12.2 | 12.5 MB | | 0%
openssl-3.3.2 | 2.7 MB | | 0%
libxgboost-2.1.1 | 1.3 MB | | 0%
conda-24.7.1 | 1.2 MB | | 0%
libsqlite-3.46.0 | 811 KB | | 0%
libcxx-18.1.8 | 427 KB | | 0%
llvm-openmp-18.1.8 | 270 KB | | 0%
certifi-2024.8.30 | 160 KB | | 0%
ca-certificates-2024 | 155 KB | | 0%
py-xgboost-2.1.1 | 132 KB | | 0%
zlib-1.2.13 | 76 KB | | 0%
libexpat-2.6.2 | 62 KB | | 0%
libzlib-1.2.13 | 46 KB | | 0%
xgboost-2.1.1 | 15 KB | | 0%
_py-xgboost-mutex-2. | 12 KB | | 0%
python-3.12.2 | 12.5 MB | | 0%
libsqlite-3.46.0 | 811 KB | 7 | 2%
openssl-3.3.2 | 2.7 MB | 2 | 1%
python-3.12.2 | 12.5 MB | 3 | 1%
openssl-3.3.2 | 2.7 MB | 6 | 2%
libsqlite-3.46.0 | 811 KB | #####1 | 14%
python-3.12.2 | 12.5 MB | 7 | 2%
libsqlite-3.46.0 | 811 KB | ###############3 | 41%
python-3.12.2 | 12.5 MB | #3 | 4%
libxgboost-2.1.1 | 1.3 MB | ######################4 | 61%
python-3.12.2 | 12.5 MB | #7 | 5%
openssl-3.3.2 | 2.7 MB | #4 | 4%
libxgboost-2.1.1 | 1.3 MB | #############################9 | 81%
libsqlite-3.46.0 | 811 KB | ########################### | 73%
openssl-3.3.2 | 2.7 MB | #6 | 5%
libxgboost-2.1.1 | 1.3 MB | ##################################### | 100%
libxgboost-2.1.1 | 1.3 MB | ##################################### | 100%
python-3.12.2 | 12.5 MB | ## | 6%
libcxx-18.1.8 | 427 KB | #3 | 4%
libsqlite-3.46.0 | 811 KB | ##############################6 | 83%
python-3.12.2 | 12.5 MB | ##3 | 6%
libcxx-18.1.8 | 427 KB | #####5 | 15%
python-3.12.2 | 12.5 MB | ### | 8%
openssl-3.3.2 | 2.7 MB | ###9 | 11%
libcxx-18.1.8 | 427 KB | ################6 | 45%
python-3.12.2 | 12.5 MB | ###2 | 9%
llvm-openmp-18.1.8 | 270 KB | ##1 | 6%
conda-24.7.1 | 1.2 MB | 4 | 1%
libcxx-18.1.8 | 427 KB | ##################################### | 100%
libcxx-18.1.8 | 427 KB | ##################################### | 100%
certifi-2024.8.30 | 160 KB | ###7 | 10%
openssl-3.3.2 | 2.7 MB | ######3 | 17%
certifi-2024.8.30 | 160 KB | ##################################### | 100%
python-3.12.2 | 12.5 MB | ####1 | 11%
llvm-openmp-18.1.8 | 270 KB | ###############3 | 42%
llvm-openmp-18.1.8 | 270 KB | ########################1 | 65%
conda-24.7.1 | 1.2 MB | #####4 | 15%
python-3.12.2 | 12.5 MB | ####5 | 12%
llvm-openmp-18.1.8 | 270 KB | ################################9 | 89%
conda-24.7.1 | 1.2 MB | #################3 | 47%
python-3.12.2 | 12.5 MB | ##### | 14%
openssl-3.3.2 | 2.7 MB | ######### | 24%
python-3.12.2 | 12.5 MB | #####6 | 15%
py-xgboost-2.1.1 | 132 KB | ####4 | 12%
openssl-3.3.2 | 2.7 MB | ##########3 | 28%
python-3.12.2 | 12.5 MB | ######1 | 17%
ca-certificates-2024 | 155 KB | ###8 | 10%
openssl-3.3.2 | 2.7 MB | ###########1 | 30%
ca-certificates-2024 | 155 KB | ###################1 | 52%
ca-certificates-2024 | 155 KB | ##################################### | 100%
py-xgboost-2.1.1 | 132 KB | ##################################### | 100%
python-3.12.2 | 12.5 MB | ######5 | 18%
libexpat-2.6.2 | 62 KB | #########5 | 26%
zlib-1.2.13 | 76 KB | #######7 | 21%
libexpat-2.6.2 | 62 KB | ##################################### | 100%
python-3.12.2 | 12.5 MB | ####### | 19%
zlib-1.2.13 | 76 KB | ##################################### | 100%
zlib-1.2.13 | 76 KB | ##################################### | 100%
libzlib-1.2.13 | 46 KB | ############9 | 35%
libzlib-1.2.13 | 46 KB | ##################################### | 100%
python-3.12.2 | 12.5 MB | #######5 | 21%
_py-xgboost-mutex-2. | 12 KB | ##################################### | 100%
_py-xgboost-mutex-2. | 12 KB | ##################################### | 100%
xgboost-2.1.1 | 15 KB | ##################################### | 100%
xgboost-2.1.1 | 15 KB | ##################################### | 100%
python-3.12.2 | 12.5 MB | ######## | 22%
python_abi-3.12 | 6 KB | ##################################### | 100%
python_abi-3.12 | 6 KB | ##################################### | 100%
python-3.12.2 | 12.5 MB | ########6 | 23%
python-3.12.2 | 12.5 MB | #########3 | 25%
openssl-3.3.2 | 2.7 MB | ##################9 | 51%
python-3.12.2 | 12.5 MB | #########9 | 27%
python-3.12.2 | 12.5 MB | ##########5 | 29%
python-3.12.2 | 12.5 MB | ########### | 30%
python-3.12.2 | 12.5 MB | ###########7 | 32%
python-3.12.2 | 12.5 MB | ############3 | 33%
python-3.12.2 | 12.5 MB | ############7 | 35%
python-3.12.2 | 12.5 MB | #############5 | 37%
python-3.12.2 | 12.5 MB | ##############3 | 39%
openssl-3.3.2 | 2.7 MB | ##############################4 | 82%
python-3.12.2 | 12.5 MB | ###############5 | 42%
python-3.12.2 | 12.5 MB | ################ | 43%
python-3.12.2 | 12.5 MB | #################2 | 47%
openssl-3.3.2 | 2.7 MB | ##################################### | 100%
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Note: you may need to restart the kernel to use updated packages.
In [158]:
#XGBoost
from xgboost import XGBClassifier
xgb_model =XGBClassifier(gamma=0)
xgb_model.fit(X_train, y_train)
Out[158]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=0, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=0, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)In [161]:
xgb_pred = xgb_model.predict(X_test)
print("Accuracy Score =", format(metrics.accuracy_score(y_test, xgb_pred)))
#XGBoost seems to be doing well with an accuracy score of 0.7795.
Accuracy Score = 0.7265625
In [163]:
print(confusion_matrix(y_test, xgb_pred))
print(classification_report(y_test, xgb_pred))
[[73 17]
[18 20]]
precision recall f1-score support
0 0.80 0.81 0.81 90
1 0.54 0.53 0.53 38
accuracy 0.73 128
macro avg 0.67 0.67 0.67 128
weighted avg 0.72 0.73 0.73 128
In [165]:
xgb_model.feature_importances_
Out[165]:
array([0.10681806, 0.26690283, 0.07197934, 0.09486822, 0.08104164,
0.15018146, 0.10262704, 0.12558141], dtype=float32)
In [167]:
(pd.Series(xgb_model.feature_importances_).plot(kind='barh'))
Out[167]:
<Axes: >
In [169]:
#Predction
#printing the prediction probabities for the the test data
print('Prediction Probabilities')
xgb_model.predict_proba(X_test)
Prediction Probabilities
Out[169]:
array([[9.92698014e-01, 7.30198948e-03],
[9.98287678e-01, 1.71232747e-03],
[9.98678267e-01, 1.32173719e-03],
[9.62741137e-01, 3.72588895e-02],
[9.95805860e-01, 4.19415860e-03],
[9.99800384e-01, 1.99629183e-04],
[1.69157982e-03, 9.98308420e-01],
[9.95779276e-01, 4.22069523e-03],
[6.82082772e-01, 3.17917198e-01],
[9.93828118e-01, 6.17191056e-03],
[1.81093395e-01, 8.18906605e-01],
[6.96693301e-01, 3.03306669e-01],
[9.96942639e-01, 3.05735902e-03],
[2.92949259e-01, 7.07050741e-01],
[9.61683869e-01, 3.83161381e-02],
[1.66436434e-01, 8.33563566e-01],
[5.87375700e-01, 4.12624300e-01],
[4.59243536e-01, 5.40756464e-01],
[9.22846317e-01, 7.71537125e-02],
[9.92303431e-01, 7.69655732e-03],
[4.71982360e-02, 9.52801764e-01],
[9.95942473e-01, 4.05751402e-03],
[8.86238933e-01, 1.13761090e-01],
[4.71579313e-01, 5.28420687e-01],
[8.06907177e-01, 1.93092853e-01],
[2.81797826e-01, 7.18202174e-01],
[7.76815891e-01, 2.23184139e-01],
[8.83256972e-01, 1.16743043e-01],
[9.99338388e-01, 6.61599974e-04],
[9.88308847e-01, 1.16911745e-02],
[9.88125741e-01, 1.18742343e-02],
[9.96705711e-01, 3.29429726e-03],
[9.98044312e-01, 1.95569103e-03],
[1.18758321e-01, 8.81241679e-01],
[3.25666785e-01, 6.74333215e-01],
[9.89137471e-01, 1.08625386e-02],
[9.85866129e-01, 1.41338641e-02],
[9.99476731e-01, 5.23241761e-04],
[9.99013960e-01, 9.86026134e-04],
[9.73583639e-01, 2.64163371e-02],
[5.72578907e-02, 9.42742109e-01],
[9.97030139e-01, 2.96986313e-03],
[1.24645114e-01, 8.75354886e-01],
[8.22680950e-01, 1.77319065e-01],
[9.99679565e-01, 3.20413412e-04],
[9.56447899e-01, 4.35521156e-02],
[1.15604341e-01, 8.84395659e-01],
[4.43898380e-01, 5.56101620e-01],
[9.64564025e-01, 3.54360007e-02],
[7.71950305e-01, 2.28049681e-01],
[4.06223476e-01, 5.93776524e-01],
[9.97017503e-01, 2.98252259e-03],
[9.62221980e-01, 3.77780497e-02],
[9.87115860e-01, 1.28841205e-02],
[9.92482126e-01, 7.51787145e-03],
[9.71470773e-01, 2.85292454e-02],
[2.41522193e-02, 9.75847781e-01],
[5.61645985e-01, 4.38354015e-01],
[6.84203386e-01, 3.15796584e-01],
[9.92389381e-01, 7.61063257e-03],
[9.80956137e-01, 1.90438889e-02],
[6.91584468e-01, 3.08415532e-01],
[9.93445873e-01, 6.55414443e-03],
[8.46824646e-02, 9.15317535e-01],
[6.86359048e-01, 3.13640922e-01],
[9.89106834e-01, 1.08931940e-02],
[1.31346583e-02, 9.86865342e-01],
[9.97377694e-01, 2.62228516e-03],
[9.07706857e-01, 9.22931656e-02],
[9.84076560e-01, 1.59234330e-02],
[9.06054616e-01, 9.39453840e-02],
[6.35308027e-02, 9.36469197e-01],
[3.59359920e-01, 6.40640080e-01],
[9.84128118e-01, 1.58719067e-02],
[9.76188898e-01, 2.38111205e-02],
[1.32267296e-01, 8.67732704e-01],
[9.99916494e-01, 8.34780512e-05],
[9.99698997e-01, 3.00994056e-04],
[9.36300278e-01, 6.36996925e-02],
[8.14855099e-04, 9.99185145e-01],
[1.40027404e-01, 8.59972596e-01],
[1.83433175e-01, 8.16566825e-01],
[9.92176473e-01, 7.82352127e-03],
[9.22738433e-01, 7.72615969e-02],
[7.93317556e-02, 9.20668244e-01],
[7.03954339e-01, 2.96045661e-01],
[9.94522274e-01, 5.47774415e-03],
[4.80942488e-01, 5.19057512e-01],
[8.43731403e-01, 1.56268626e-01],
[1.10597014e-02, 9.88940299e-01],
[1.41111612e-01, 8.58888388e-01],
[9.18520510e-01, 8.14795047e-02],
[9.79698181e-01, 2.03017965e-02],
[9.40711617e-01, 5.92883751e-02],
[4.03893709e-01, 5.96106291e-01],
[6.05168939e-01, 3.94831091e-01],
[8.15820932e-01, 1.84179068e-01],
[6.85274839e-01, 3.14725190e-01],
[6.28185332e-01, 3.71814668e-01],
[3.16698074e-01, 6.83301926e-01],
[9.44051385e-01, 5.59486113e-02],
[9.98127282e-01, 1.87270797e-03],
[9.77459550e-01, 2.25404389e-02],
[9.84630167e-01, 1.53698223e-02],
[8.09853554e-01, 1.90146461e-01],
[7.96324253e-01, 2.03675762e-01],
[2.09695697e-02, 9.79030430e-01],
[9.98806477e-01, 1.19350385e-03],
[9.37582970e-01, 6.24170527e-02],
[2.94814408e-01, 7.05185592e-01],
[9.35056090e-01, 6.49439394e-02],
[5.82101226e-01, 4.17898804e-01],
[8.78445029e-01, 1.21555001e-01],
[9.97622252e-01, 2.37775035e-03],
[9.32184637e-01, 6.78153485e-02],
[7.75922000e-01, 2.24078014e-01],
[9.89523232e-01, 1.04767652e-02],
[1.01554394e-02, 9.89844561e-01],
[2.14435935e-01, 7.85564065e-01],
[9.80520129e-01, 1.94799006e-02],
[1.81804895e-02, 9.81819510e-01],
[6.28389716e-02, 9.37161028e-01],
[9.95727003e-01, 4.27297084e-03],
[9.26763833e-01, 7.32361525e-02],
[9.17421699e-01, 8.25782716e-02],
[5.73029280e-01, 4.26970750e-01],
[9.91200626e-01, 8.79937690e-03],
[2.17268288e-01, 7.82731712e-01]], dtype=float32)
In [ ]: